# Third-party packages this notebook relies on.
packages.used <- c(
  "tibble", "sentimentr", "gplots", "dplyr", "tm", "syuzhet", "factoextra",
  "scales", "RColorBrewer", "RANN", "topicmodels", "stringr", "wordcloud",
  "tidytext"
)
# Packages from that list which are not present in the local library.
packages.needed <- setdiff(packages.used, installed.packages()[, 1])
# Install only what is missing, together with its dependencies.
if (length(packages.needed) > 0) {
  install.packages(packages.needed, dependencies = TRUE)
}
# load packages
library("tibble")
library("syuzhet")
library("sentimentr")
library("gplots")
library("dplyr")
library("tm")
library("syuzhet")
library("factoextra")
library("scales")
library("RColorBrewer")
library("RANN")
library("tm")
library("topicmodels")
library("stringr")
library("wordcloud")
library("tidytext")
This notebook was prepared with the following environmental settings.
# Record the R build this notebook was rendered with.
print(R.version)
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## system x86_64, mingw32
## status
## major 4
## minor 1.1
## year 2021
## month 08
## day 10
## svn rev 80725
## language R
## version.string R version 4.1.1 (2021-08-10)
## nickname Kick Things
# Read the corpus from "philosophy_data.csv" and report its dimensions
# (rows, columns).
df <- read.csv("philosophy_data.csv")
dim(df)
## [1] 360808 11
# List the distinct values of the `author` column.
unique(df$author)
## [1] "Plato" "Aristotle" "Locke" "Hume"
## [5] "Berkeley" "Spinoza" "Leibniz" "Descartes"
## [9] "Malebranche" "Russell" "Moore" "Wittgenstein"
## [13] "Lewis" "Quine" "Popper" "Kripke"
## [17] "Foucault" "Derrida" "Deleuze" "Merleau-Ponty"
## [21] "Husserl" "Heidegger" "Kant" "Fichte"
## [25] "Hegel" "Marx" "Lenin" "Smith"
## [29] "Ricardo" "Keynes" "Epictetus" "Marcus Aurelius"
## [33] "Nietzsche" "Wollstonecraft" "Beauvoir" "Davis"
# List the distinct values of the `school` column.
unique(df$school)
## [1] "plato" "aristotle" "empiricism" "rationalism"
## [5] "analytic" "continental" "phenomenology" "german_idealism"
## [9] "communism" "capitalism" "stoicism" "nietzsche"
## [13] "feminism"
# Count the distinct authors represented in each school.
summarise(group_by(df, school), author_num = n_distinct(author))
## # A tibble: 13 x 2
## school author_num
## <chr> <int>
## 1 analytic 7
## 2 aristotle 1
## 3 capitalism 3
## 4 communism 2
## 5 continental 3
## 6 empiricism 3
## 7 feminism 3
## 8 german_idealism 3
## 9 nietzsche 1
## 10 phenomenology 3
## 11 plato 1
## 12 rationalism 4
## 13 stoicism 2
The dataset I use is the Philosophy dataset. This dataset contains 59 articles, which were written by 36 famous philosophers from 13 schools. As the dataset has already been cleaned with no missing values or abnormal inputs, there is no need for extra data cleaning.
In this project, I use sentences as the unit of analysis, since sentences are natural language units for organizing thoughts and ideas. I apply sentiment analysis using the NRC Emotion Lexicon. “The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive). The annotations were manually done by crowdsourcing.”
# data with number of tokens
# Count the word tokens in each string.
#
# Args:
#   str: character vector to measure.
# Returns:
#   For every element of `str`, the number of matches of the regular
#   expression \w+ as reported by str_count().
word_count <- function(str) {
  str_count(str, pattern = "\\w+")
}
# Add a per-sentence token count. The column is referenced through dplyr's
# data mask instead of `df$...`, so the count is always computed from the rows
# that are actually flowing through the pipeline.
token_data <- df %>%
  mutate(n_tokens = word_count(tokenized_txt))
# Sentiments for each sentence
# Score every sentence with get_nrc_sentiment() and cache the result on disk.
#
# Args:
#   df: data frame holding the columns title, author, school, sentence_str
#       and original_publication_date.
# Returns:
#   The path of the CSV file that was written ("sentence_emotion.csv"). The
#   file holds the selected columns, a running `id`, and the columns returned
#   by get_nrc_sentiment().
sentiment_score <- function(df) {
  # seq_len() stays empty for a zero-row input, whereas 1:nrow(df) would
  # produce c(1, 0), which does not match a zero-row frame.
  sentence <- df %>%
    select(c("title", "author", "school", "sentence_str",
             "original_publication_date")) %>%
    mutate(id = seq_len(nrow(df)))
  sentence$sentence_str <- as.character(sentence$sentence_str)
  emotions <- get_nrc_sentiment(sentence$sentence_str)
  sentence <- bind_cols(sentence, emotions)
  output_dp <- "sentence_emotion.csv"
  write.csv(sentence, output_dp)
  output_dp
}
# Find the index of a maximum, breaking ties uniformly at random.
#
# Args:
#   a: numeric vector.
# Returns:
#   A single integer: the position of the largest value in `a`. When several
#   positions share the maximum, one of them is drawn at random.
random_max <- function(a) {
  candidates <- which(a == max(a))
  # Draw a *position* within `candidates` rather than calling
  # sample(candidates, 1): when `candidates` is a single number k, sample()
  # treats it as 1:k and may return an index that is not a maximum.
  candidates[sample.int(length(candidates), 1L)]
}
# Label every sentence with its strongest emotion.
#
# Args:
#   processed_df: path to the CSV of per-sentence emotion counts.
# Returns:
#   The data read from that file plus two columns: `top`, the position (within
#   the columns anger..trust) of the highest count as picked by random_max(),
#   or 9 when the highest count is zero; and `top.a`, the highest count itself.
top_emotion <- function(processed_df) {
  sentence <- read.csv(processed_df)
  emotion_counts <- select(sentence, anger:trust)
  # Position of the winning emotion column for each row
  sentence$top <- apply(emotion_counts, 1, random_max)
  # Count reached by that winning emotion
  sentence$top.a <- apply(emotion_counts, 1, max)
  # Code 9 flags rows whose emotion counts are all zero
  no_emotion <- sentence$top.a == 0
  sentence$top[no_emotion] <- 9
  sentence
}
# Label every sentence with its dominant sentiment.
#
# Args:
#   processed_df: path to the CSV of per-sentence sentiment counts.
# Returns:
#   The data read from that file plus two columns: `sentiment`, the position
#   (within the columns negative..positive) of the higher count as picked by
#   random_max(), or 3 when that count is zero; and `sentiment.a`, the higher
#   count itself.
sentence_sentiment <- function(processed_df) {
  sentence <- read.csv(processed_df)
  polarity_counts <- select(sentence, negative:positive)
  # Position of the winning sentiment column for each row
  sentence$sentiment <- apply(polarity_counts, 1, random_max)
  # Count reached by that winning sentiment
  sentence$sentiment.a <- apply(polarity_counts, 1, max)
  # Code 3 flags rows whose sentiment counts are both zero
  no_sentiment <- sentence$sentiment.a == 0
  sentence$sentiment[no_sentiment] <- 3
  sentence
}
# The scores are cached on disk; to rebuild the cache run
# processed_df <- sentiment_score(df)
processed_df <- "sentence_emotion.csv"
emotion_score <- read.csv(processed_df)
head(emotion_score)
## title author school
## 1 Plato - Complete Works Plato plato
## 2 Plato - Complete Works Plato plato
## 3 Plato - Complete Works Plato plato
## 4 Plato - Complete Works Plato plato
## 5 Plato - Complete Works Plato plato
## 6 Plato - Complete Works Plato plato
## sentence_str
## 1 What's new, Socrates, to make you leave your usual haunts in the Lyceum and spend your time here by the king archon's court?
## 2 Surely you are not prosecuting anyone before the king archon as I am?
## 3 The Athenians do not call this a prosecution but an indictment, Euthyphro.
## 4 What is this you say?
## 5 Someone must have indicted you, for you are not going to tell me that you have indicted someone else.
## 6 But someone else has indicted you?
## original_publication_date id anger anticipation disgust fear joy sadness
## 1 -350 1 1 2 0 1 0 1
## 2 -350 2 0 0 0 0 0 0
## 3 -350 3 0 0 1 1 0 0
## 4 -350 4 0 0 0 0 0 0
## 5 -350 5 0 0 0 0 0 0
## 6 -350 6 0 0 0 0 0 0
## surprise trust negative positive
## 1 1 1 1 2
## 2 0 0 0 1
## 3 0 0 2 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
# Compute the top emotion of every sentence from the cached scores.
# NOTE(review): the result is stored under the same name as the function
# top_emotion(), so the function binding is replaced from this point on and
# this statement cannot be re-run without redefining the function.
top_emotion <- top_emotion(processed_df)
head(top_emotion)
## title author school
## 1 Plato - Complete Works Plato plato
## 2 Plato - Complete Works Plato plato
## 3 Plato - Complete Works Plato plato
## 4 Plato - Complete Works Plato plato
## 5 Plato - Complete Works Plato plato
## 6 Plato - Complete Works Plato plato
## sentence_str
## 1 What's new, Socrates, to make you leave your usual haunts in the Lyceum and spend your time here by the king archon's court?
## 2 Surely you are not prosecuting anyone before the king archon as I am?
## 3 The Athenians do not call this a prosecution but an indictment, Euthyphro.
## 4 What is this you say?
## 5 Someone must have indicted you, for you are not going to tell me that you have indicted someone else.
## 6 But someone else has indicted you?
## original_publication_date id anger anticipation disgust fear joy sadness
## 1 -350 1 1 2 0 1 0 1
## 2 -350 2 0 0 0 0 0 0
## 3 -350 3 0 0 1 1 0 0
## 4 -350 4 0 0 0 0 0 0
## 5 -350 5 0 0 0 0 0 0
## 6 -350 6 0 0 0 0 0 0
## surprise trust negative positive top top.a
## 1 1 1 1 2 1 2
## 2 0 0 0 1 9 0
## 3 0 0 2 0 3 1
## 4 0 0 0 0 9 0
## 5 0 0 0 0 9 0
## 6 0 0 0 0 9 0
# Compute the dominant sentiment of every sentence from the cached scores.
# NOTE(review): the result is stored under the same name as the function
# sentence_sentiment(), so the function binding is replaced from this point on
# and this statement cannot be re-run without redefining the function.
sentence_sentiment <- sentence_sentiment(processed_df)
head(sentence_sentiment)
## title author school
## 1 Plato - Complete Works Plato plato
## 2 Plato - Complete Works Plato plato
## 3 Plato - Complete Works Plato plato
## 4 Plato - Complete Works Plato plato
## 5 Plato - Complete Works Plato plato
## 6 Plato - Complete Works Plato plato
## sentence_str
## 1 What's new, Socrates, to make you leave your usual haunts in the Lyceum and spend your time here by the king archon's court?
## 2 Surely you are not prosecuting anyone before the king archon as I am?
## 3 The Athenians do not call this a prosecution but an indictment, Euthyphro.
## 4 What is this you say?
## 5 Someone must have indicted you, for you are not going to tell me that you have indicted someone else.
## 6 But someone else has indicted you?
## original_publication_date id anger anticipation disgust fear joy sadness
## 1 -350 1 1 2 0 1 0 1
## 2 -350 2 0 0 0 0 0 0
## 3 -350 3 0 0 1 1 0 0
## 4 -350 4 0 0 0 0 0 0
## 5 -350 5 0 0 0 0 0 0
## 6 -350 6 0 0 0 0 0 0
## surprise trust negative positive sentiment sentiment.a
## 1 1 1 1 2 2 2
## 2 0 0 0 1 1 1
## 3 0 0 2 0 1 2
## 4 0 0 0 0 3 0
## 5 0 0 0 0 3 0
## 6 0 0 0 0 3 0
We can see emotion score for each sentence.
First, I want to know which school is the most productive, i.e., how many articles did each school publish, and how many sentences did each school write?
# Number of distinct titles per school, largest first
title_per_school <- df %>%
  group_by(school) %>%
  summarise(title_per_school = n_distinct(title))
title_per_school <- arrange(title_per_school, desc(title_per_school))
title_per_school
## # A tibble: 13 x 2
## school title_per_school
## <chr> <int>
## 1 analytic 11
## 2 german_idealism 7
## 3 continental 6
## 4 empiricism 6
## 5 rationalism 6
## 6 nietzsche 5
## 7 phenomenology 5
## 8 capitalism 3
## 9 communism 3
## 10 feminism 3
## 11 stoicism 2
## 12 aristotle 1
## 13 plato 1
# Horizontal bar chart of the title counts
ggplot(title_per_school, aes(x = school, y = title_per_school)) +
  geom_col(fill = "#f68060", alpha = .6, width = .4) +
  coord_flip() +
  labs(
    title = "Number of Titles per school",
    x = "School",
    y = "Number of Titles"
  ) +
  theme_bw()
We can find “analytic” published most articles, followed by “german_idealism”, while “aristotle” and “plato” had the least titles.
# Number of distinct sentences per school, largest first
sentence_per_school <- df %>%
  group_by(school) %>%
  summarise(sentence_per_school = n_distinct(sentence_str))
sentence_per_school <- arrange(sentence_per_school, desc(sentence_per_school))
sentence_per_school
## # A tibble: 13 x 2
## school sentence_per_school
## <chr> <int>
## 1 analytic 55425
## 2 aristotle 48779
## 3 german_idealism 42136
## 4 plato 38366
## 5 continental 33779
## 6 phenomenology 28573
## 7 rationalism 22949
## 8 empiricism 19931
## 9 feminism 18635
## 10 capitalism 18194
## 11 communism 17958
## 12 nietzsche 13548
## 13 stoicism 2535
# Horizontal bar chart of the sentence counts
ggplot(sentence_per_school, aes(x = school, y = sentence_per_school)) +
  geom_col(fill = "#f68060", alpha = .6, width = .4) +
  coord_flip() +
  labs(
    title = "Number of Sentences per School",
    x = "School",
    y = "Number of Sentences"
  ) +
  theme_bw()
How about sentences?
We can see from the graph that “analytic” and “aristotle” dominate in the number of sentences contained in the dataset, while “stoicism” has far fewer sentences than the other schools.
Is there any Correlation between number of sentences and number of articles?
One might expect that the more articles a school published, the more sentences it wrote. However, as we found above, a larger number of sentences does not mean more titles. What was really surprising is that although “Aristotle” has only one article in the dataset, it ranks second in the number of sentences. “Plato” was also a surprise to me. I therefore want to know how many sentences there are in each article.
# Number of distinct sentences per title, largest first
sentence_per_title <- df %>%
  group_by(title) %>%
  summarise(sentence_per_title = n_distinct(sentence_str))
sentence_per_title <- arrange(sentence_per_title, desc(sentence_per_title))
# Horizontal bar chart; the axis text is shrunk so the title labels fit
ggplot(sentence_per_title, aes(x = title, y = sentence_per_title)) +
  geom_col(fill = "#f68060", alpha = .6, width = .4) +
  coord_flip() +
  labs(
    title = "Number of Sentences per Title",
    x = "Title",
    y = "Number of Sentences"
  ) +
  theme_bw() +
  theme(axis.text.y = element_text(size = 5))
Not surprisingly, the numbers of sentences in the articles of “Aristotle” and “Plato” are remarkably large.
What if we see the data in the time frame? Does the length of sentences vary over the year?
# Mean sentence length for every publication year, longest first
sentence_length_year <- df %>%
  group_by(original_publication_date) %>%
  summarize(mean_sentence_length = mean(sentence_length))
sentence_length_year <- arrange(sentence_length_year,
                                desc(mean_sentence_length))
# visualize
sentence_length_year %>%
  ggplot(aes(x = original_publication_date, y = mean_sentence_length)) +
  geom_line() +
  labs(title = "Length of Sentences Over the Years")
We can see that in the early days sentences were usually brief; as time went by they became longer and richer, reaching a peak around the 1600s. A sharp decrease followed, and the length has fluctuated ever since.
How did the emotion distribute in the philosophy texts?
# `top` holds the position of the winning column inside anger:trust (1-8) and
# 9 for sentences without any emotion word. The labels therefore have to be
# listed in that column order (the order printed by head(emotion_score));
# otherwise every bar is drawn under the wrong name. Each colour stays paired
# with the emotion it was originally chosen for.
emotion <- c("anger", "anticipation", "disgust", "fear", "joy",
             "sadness", "surprise", "trust", "neutral")
col <- c("red", "pink", "green", "yellow", "purple",
         "blue", "orange", "brown", "black")
# Share of sentences per top emotion; bar i is labelled emotion[i] and filled
# with col[i].
top_emotion %>% ggplot(aes(x = top)) +
  geom_bar(aes(y = (..count..) / sum(..count..)),
           fill = col) +
  scale_x_discrete(limits = emotion) +
  scale_y_continuous(labels = scales::percent,
                     limits = c(0, .3)) +
  labs(title = "Overall Emotions In the Philosophy Texts",
       x = "Emotion", y = "Percentage of Sentences") +
  theme_linedraw()
The graph shows the overall distribution of emotions across all philosophy texts. Less than 30% of the sentences were neutral, with no emotion in them, which was a little surprising, since I had expected philosophical texts to be plain and full of terminology. Besides, anticipation and joy are the major types of emotion expressed in the philosophical texts, accounting for around 30% of all sentences.
How about the sentiment?
# Labels and colours for the sentiment codes, in code order:
# 1 = negative, 2 = positive, 3 = neutral
sentiment <- c("negative", "positive", "neutral")
col_sentiment <- c("red", "green", "black")
# Share of sentences per sentiment
ggplot(sentence_sentiment, aes(x = sentiment)) +
  geom_bar(aes(y = (..count..) / sum(..count..)), fill = col_sentiment) +
  scale_x_discrete(limits = sentiment) +
  scale_y_continuous(labels = scales::percent, limits = c(0, .5)) +
  labs(
    x = "Sentiment",
    y = "Percentage of Sentences",
    title = "Overall Sentiment In the Philosophy Texts"
  ) +
  theme_linedraw()
We can see that negative sentences took up about half of the philosophy texts. One odd thing is that when we analyzed emotions, anticipation and joy were the major types, whereas for sentiment the negative class dominated. Does this mean that emotion is not the same as sentiment?
Is there any difference in each school? Since philosophers from different schools usually had different even contradicting ideas and beliefs, so how about emotions or sentiment?
Do philosophers from different schools of philosophy tend to express different emotions or sentiment in their articles?
# Turn the numeric codes in `top` into a labelled factor. Codes 1-8 are column
# positions inside anger:trust and 9 marks sentences without emotion words, so
# the labels are spelled out in that column order here. Mapping codes to
# labels explicitly also keeps working when a code never occurs in the data.
emotion_by_code <- c("anger", "anticipation", "disgust", "fear", "joy",
                     "sadness", "surprise", "trust", "neutral")
# Guard so that re-running this chunk does not relabel an existing factor.
if (!is.factor(top_emotion$top)) {
  top_emotion$top <- factor(top_emotion$top,
                            levels = seq_along(emotion_by_code),
                            labels = emotion_by_code)
}
top_emotion$school <- as.factor(top_emotion$school)
# Stacked bars: share of each top emotion within every school
top_emotion %>%
  group_by(school) %>%
  count(top) %>%
  mutate(per = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(fill = top, x = school, y = per)) +
  geom_bar(stat = "identity", position = "stack") +
  # A named palette keeps every emotion on the colour paired with it in
  # `emotion`/`col`, whatever the order of the factor levels.
  scale_fill_manual(values = setNames(col, emotion)) +
  labs(title = "Percentage of Emotions for Each School",
       x = "School", y = "Percentage") +
  scale_y_continuous(labels = scales::percent) +
  theme_linedraw() +
  theme(legend.title = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
As shown in graph, the differences in terms of emotions are little across 13 schools. This suggests that on average, philosophers from different schools tended to express the similar types of emotion even when they had very different thoughts or ideas.
# Turn the numeric sentiment codes (1 = negative, 2 = positive, 3 = neutral)
# into a labelled factor. Mapping codes to labels explicitly keeps working
# when a code never occurs in the data; the guard makes the chunk re-runnable.
if (!is.factor(sentence_sentiment$sentiment)) {
  sentence_sentiment$sentiment <- factor(sentence_sentiment$sentiment,
                                         levels = seq_along(sentiment),
                                         labels = sentiment)
}
sentence_sentiment$school <- as.factor(sentence_sentiment$school)
# Stacked bars: share of each sentiment within every school
sentence_sentiment %>%
  group_by(school) %>%
  count(sentiment) %>%
  mutate(per = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(fill = sentiment, x = school, y = per)) +
  geom_bar(stat = "identity", position = "stack") +
  # A named palette ties each colour to its sentiment label.
  scale_fill_manual(values = setNames(col_sentiment, sentiment)) +
  # The title previously read "Emotions", copied over from the emotion plot.
  labs(title = "Percentage of Sentiment for Each School",
       x = "School", y = "Percentage") +
  scale_y_continuous(labels = scales::percent) +
  theme_linedraw() +
  theme(legend.title = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
So did sentiment.
# Draw a word cloud of the most frequent terms used by one school.
#
# Args:
#   data: data frame with the columns school, author, title, sentence_lowered
#         and original_publication_date.
#   select_school: value of `school` whose sentences are plotted.
# Returns:
#   Whatever wordcloud() returns; the function is called for its plot.
# Side effects:
#   Seeds the RNG with 20220202 before drawing and plots on the active
#   graphics device.
word_cloud <- function(data, select_school) {
  text <- data %>%
    select(school, author, title, sentence_lowered,
           original_publication_date) %>%
    filter(school == select_school)
  # Create a corpus
  my_custom_stopwords <- c("one", "will", "may", "things", "say",
                           "can", "now", "even", "also", "must", "whether")
  corpus <- VCorpus(VectorSource(text$sentence_lowered))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeWords, my_custom_stopwords)
  # Total frequency of every term across the school's sentences
  tdm <- TermDocumentMatrix(corpus)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(words = names(v), freq = v)
  set.seed(20220202)
  # The column is named `words`; `d$word` only worked through partial matching
  # of `$` on data frames, so the full name is used here.
  wordcloud(words = d$words, freq = d$freq, min.freq = 10,
            max.words = 150, random.order = FALSE, rot.per = 0.3,
            colors = brewer.pal(8, "Dark2"), scale = c(4, 0.5))
}
# Word clouds for the schools "aristotle" and "plato"
word_cloud(df,"aristotle")
word_cloud(df,"plato")
In the theory of Forms, Plato presented a solution to the problem of universals. He used many words such as “think”, “soul” and “good”.
# Word cloud for the school "empiricism"
word_cloud(df,"empiricism")
It is not surprising that “idea”, “mind” and “knowledge” appear frequently in the articles of empiricism.
# Word clouds for the schools "rationalism" and "analytic"
word_cloud(df,"rationalism")
word_cloud(df,"analytic")
For philosophers in the school of analytic, they attached importance to “true”, “theory”, “fact”, “truth”.
# Word clouds for the schools "continental" and "phenomenology"
word_cloud(df,"continental")
word_cloud(df,"phenomenology")
For philosophers in the school of phenomenology, they attached importance to “world”, “experience”, “consciousness”, “object”.
# Word clouds for the schools "german_idealism" and "communism"
word_cloud(df,"german_idealism")
word_cloud(df,"communism")
For philosophers in the school of communism, they attached importance to “labour”, “power”, “production”, “value”.
# Word cloud for the school "capitalism"
word_cloud(df,"capitalism")
For philosophers in the school of capitalism, they attached importance to “price”, “money” and “capital”.
# Word clouds for the schools "stoicism", "nietzsche" and "feminism"
word_cloud(df,"stoicism")
word_cloud(df,"nietzsche")
word_cloud(df,"feminism")
“Woman” contributes most in the articles of feminism.
Being interested in feminism, I ask: how did scholars think about feminism as time went by?
# Feminist sentences split by publication period.
# NOTE(review): `greek` is presumably empty, since the feminist authors listed
# for this dataset are not ancient writers - confirm before using it.
greek <- df %>%
  filter(school == "feminism",
         original_publication_date == -350 |
           original_publication_date == -320) %>%
  select(school, author, title, sentence_lowered, original_publication_date)
# Published after 1600 and up to (and including) 1800
modern <- df %>%
  filter(school == "feminism",
         original_publication_date > 1600 &
           original_publication_date <= 1800) %>%
  select(school, author, title, sentence_lowered, original_publication_date)
# Published after 1800
contemporary <- df %>%
  filter(school == "feminism",
         original_publication_date > 1800) %>%
  select(school, author, title, sentence_lowered, original_publication_date)
# Draw a word cloud of the most frequent terms in a set of sentences.
#
# Args:
#   time_data: data frame with a `sentence_lowered` column holding the
#              sentences to summarise.
# Returns:
#   Whatever wordcloud() returns; the function is called for its plot.
# Side effects:
#   Plots on the active graphics device.
wordcloud_time <- function(time_data) {
  # Create a corpus
  my_custom_stopwords <- c("one", "will", "may", "things", "say",
                           "can", "now", "even", "also", "must", "whether")
  corpus <- VCorpus(VectorSource(time_data$sentence_lowered))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeWords, my_custom_stopwords)
  # Total frequency of every term across the supplied sentences
  tdm <- TermDocumentMatrix(corpus)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(words = names(v), freq = v)
  # The column is named `words`; `d$word` only worked through partial matching
  # of `$` on data frames, so the full name is used here.
  wordcloud(words = d$words, freq = d$freq, min.freq = 10,
            max.words = 150, random.order = FALSE, rot.per = 0.3,
            colors = brewer.pal(8, "Dark2"), scale = c(4, 0.5))
}
# Word cloud for the feminist texts of the modern period.
# NOTE(review): the same call appears twice; the first one may have been meant
# for a different period (e.g. `greek`) - confirm.
wordcloud_time(modern)
wordcloud_time(modern)
From the word cloud for feminism in modern philosophy, we can see that philosophers talked a lot about nature, love, respect and mind, which relate more to psychology.
# Word cloud for the feminist texts published after 1800
wordcloud_time(contemporary)
In contemporary philosophy, there are more words related to science and responsibility, such as “husband”, “wife”, “black”, “white”. Philosophers focus more on rational and logical sides.
There is no correlation between the number of sentences and the number of articles each school published.
A large share of the sentences in the philosophical articles are emotionally neutral, while anticipation and joy also took a large part. As for sentiment, the proportion of negative sentences is larger than that of positive ones. Besides, philosophers from different schools tend to express similar emotions and sentiments even when they had different thoughts.
There are similarities between different schools and philosophers, and the ideas of philosophers also changed as the time went by.
However, due to the limitations of the dataset, the analysis is still biased to some extent, since the data does not include all philosophers and their articles. Besides, many sentences carry mixed emotions or sentiments, which call for more specific analysis such as aspect-based classification; classification at the sentence level alone is not enough.